In [1]:
import scanpy as sc
import copy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import PolygonSelector
from matplotlib.path import Path
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
In [1]:
import numpy as np

mel = np.load('/QRISdata/Q1851/STOmics_SAW_Outputs/lncRNA/Melanoma.npy')
mel
Out[1]:
array([[ 2380, 11100],
       [ 2400,  6960],
       [ 2400, 10260],
       ...,
       [10120, 18980],
       [10140, 18920],
       [10140, 18960]], dtype=uint32)
In [27]:
import scanpy as sc

lnc_bin20 = sc.read_h5ad("lnc_bin20_sc_processed_leiden.h5ad")
#IBD_bin20 = sc.read_h5ad("/QRISdata/Q1851/STOmics_SAW_Outputs/IBD/visualization/C05096G2.bin20_1.0.h5ad")
lnc_bin20.var['ENS']=lnc_bin20.var.index
lnc_bin20.var.index=lnc_bin20.var['real_gene_name']

lnc_bin20.raw.var['ENS']=lnc_bin20.raw.var.index
lnc_bin20.raw.var.index=lnc_bin20.raw.var['real_gene_name']
lnc_bin20.var_names = lnc_bin20.var['real_gene_name']

lnc_bin20
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:887: UserWarning: 
AnnData expects .var.index to contain strings, but got values like:
    ['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")
Out[27]:
AnnData object with n_obs × n_vars = 512623 × 206923
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'leiden_res_0.01', 'leiden_res_0.10', 'leiden_res_0.50'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden', 'leiden_resolution', 'log1p', 'merged', 'neighbors', 'omics', 'pca', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'
In [2]:
import pickle

# Load the object from the pickle file
with open("/QRISdata/Q1851/STOmics_SAW_Outputs/lncRNA/visualization/lncRNA_bin20_annotated_cancer.pkl", "rb") as f:
    lnc_bin20 = pickle.load(f)
lnc_bin20.var['ENS']=lnc_bin20.var.index
lnc_bin20.var.index=lnc_bin20.var['real_gene_name']

lnc_bin20.raw.var['ENS']=lnc_bin20.raw.var.index
lnc_bin20.raw.var.index=lnc_bin20.raw.var['real_gene_name']
lnc_bin20.var_names = lnc_bin20.var['real_gene_name']

lnc_bin20
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:887: UserWarning: 
AnnData expects .var.index to contain strings, but got values like:
    ['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")
Out[2]:
AnnData object with n_obs × n_vars = 512623 × 206923
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [6]:
lnc_bin20.uns['omics']
Out[6]:
array(['Transcriptomics'], dtype=object)
In [29]:
lnc_bin20 = lnc_bin20[:, ~lnc_bin20.var_names.str.startswith("cuTAR")].copy()
lnc_bin20
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Out[29]:
AnnData object with n_obs × n_vars = 512623 × 34559
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [30]:
import anndata
import pandas as pd

# Assume lnc_bin20 is your AnnData object
# Check current var_names and their type
print("Original var_names:", lnc_bin20.var_names[:5])  # Preview first 5
print("Type of var_names:", type(lnc_bin20.var_names))
print("Is categorical:", pd.api.types.is_categorical_dtype(lnc_bin20.var_names))

# Option 1: Convert var_names to strings and make unique
# This avoids the categorical issue by using a simple string index
lnc_bin20.var_names = lnc_bin20.var_names.astype(str)  # Convert to string
lnc_bin20.var_names_make_unique()  # Now this should work

# Verify the result
print("Updated var_names:", lnc_bin20.var_names[:5])  # Check first 5
print("Any duplicates:", lnc_bin20.var_names.has_duplicates)  # Should be False

# Optional: Save the modified AnnData object
#lnc_bin20.write_h5ad('lnc_bin20_unique.h5ad')
Original var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Type of var_names: <class 'pandas.core.indexes.base.Index'>
Is categorical: False
Updated var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Any duplicates: False
In [31]:
# Step 2: Check and fix var_names duplicates BEFORE setting .raw
print("Before fix:")
print("Any duplicates in adata.var_names:", lnc_bin20.var_names.has_duplicates)
dupes = lnc_bin20.var_names[lnc_bin20.var_names.duplicated()]
print("Duplicated gene names:")
print(dupes)

# Fix duplicates
lnc_bin20.var_names_make_unique()

# Step 3: Set .raw after fixing
lnc_bin20.raw = lnc_bin20

# Step 4: Confirm fix
print("\nAfter fix:")
print("Any duplicates in .raw.var_names:", lnc_bin20.raw.var_names.has_duplicates)
Before fix:
Any duplicates in adata.var_names: False
Duplicated gene names:
Index([], dtype='object', name='real_gene_name')

After fix:
Any duplicates in .raw.var_names: False
In [33]:
lnc_bin20.obs
Out[33]:
total_counts n_genes_by_counts pct_counts_mt leiden orig.ident x y Cancer
10222022175580 2 1 0.0 9 sample 2380 11100 Melanoma
10307921517360 1 1 0.0 9 sample 2400 6960 Melanoma
10307921520660 1 1 0.0 9 sample 2400 10260 Melanoma
10307921521540 2 1 0.0 9 sample 2400 11140 Melanoma
10307921521740 1 1 0.0 9 sample 2400 11340 Melanoma
... ... ... ... ... ... ... ... ...
94832877901060 4 3 0.0 8 sample 22080 5380 GBM
94832877901080 1 1 0.0 9 sample 22080 5400 GBM
94832877901100 5 3 0.0 8 sample 22080 5420 GBM
94832877901120 4 2 0.0 9 sample 22080 5440 GBM
94832877901140 1 1 0.0 9 sample 22080 5460 GBM

512623 rows × 8 columns

In [35]:
adata_mel = lnc_bin20[lnc_bin20.obs['Cancer'] == 'Melanoma'].copy()
adata_mel
Out[35]:
AnnData object with n_obs × n_vars = 194752 × 34559
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [36]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_mel)
# Logarithmize the data
sc.pp.log1p(adata_mel)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_mel.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
    adata_mel.obs["total_counts"][adata_mel.obs["total_counts"] < 10000],
    kde=False,
    bins=40,
    ax=axs[1],
)
sns.histplot(adata_mel.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
    adata_mel.obs["n_genes_by_counts"][adata_mel.obs["n_genes_by_counts"] < 4000],
    kde=False,
    bins=60,
    ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
  warn(UserWarning('Some cells have zero counts'))
Out[36]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
No description has been provided for this image
In [37]:
sc.pp.filter_cells(adata_mel, max_counts=200)
adata_mel = adata_mel[adata_mel.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_mel.n_obs}")
sc.pp.filter_genes(adata_mel, min_cells=10)
adata_mel
#cells after MT filter: 194554
Out[37]:
AnnData object with n_obs × n_vars = 194554 × 23230
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [38]:
sc.tl.pca(adata_mel, use_highly_variable=False)
sc.pp.neighbors(adata_mel)
sc.tl.umap(adata_mel)
In [39]:
sc.tl.leiden(adata_mel, key_added="leiden_res_0.0001", resolution=0.0001)
sc.pl.umap(adata_mel, color=["leiden_res_0.0001"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [40]:
sc.pl.spatial(adata_mel, color=["leiden_res_0.0001"], spot_size=50)
No description has been provided for this image
In [41]:
sc.tl.leiden(adata_mel, key_added="leiden_res_0.001", resolution=0.001)
sc.pl.umap(adata_mel, color=["leiden_res_0.001"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [42]:
sc.pl.spatial(adata_mel, color=["leiden_res_0.001"], spot_size=50)
No description has been provided for this image
In [47]:
mel_marker_genes = {
    #"CRC": ["CEACAM5", "CDX2", "EPCAM", "KRT20", "MUC2", "CLDN1"],
    "melanoma": ["MITF", "TYR", "DCT", "MLANA", "PMEL", "SOX10"]
}
# Make unique if duplicates exist
sc.pl.dotplot(
    adata_mel,
    var_names=mel_marker_genes,
    groupby='leiden_res_0.001',
    standard_scale='var',
    dot_max=0.5,
    color_map='Reds',
    show=True, use_raw=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [46]:
# Identify marker genes for each cluster
sc.tl.rank_genes_groups(adata_mel, groupby="leiden_res_0.001", method="t-test")#, use_raw=False)  # or method="wilcoxon"
sc.pl.rank_genes_groups_dotplot(adata_mel, n_genes=10, groupby="leiden_res_0.001")#, use_raw=False)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [48]:
import pickle

# Save
with open("lnc_Melanpma.pkl", "wb") as f:
    pickle.dump(adata_mel, f)
In [77]:
import pickle

# Save
with open("lnc_Melanpma.pkl", "rb") as f:
    adata_mel20=pickle.load(f)
In [79]:
mel_marker_genes = {
    #"CRC": ["CEACAM5", "CDX2", "EPCAM", "KRT20", "MUC2", "CLDN1"],
    "melanoma": ["MITF", "TYR", "DCT", "MLANA", "PMEL", "SOX10","KRT5","PRAME","S100B"]
}
# Make unique if duplicates exist
sc.pl.dotplot(
    adata_mel20,
    var_names=mel_marker_genes,
    groupby='leiden_res_0.001',
    standard_scale='var',
    color_map='Reds',
    show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

bin50¶

In [2]:
import pickle

# Load the object from the pickle file
with open("/QRISdata/Q1851/STOmics_SAW_Outputs/lncRNA/visualization/lncRNA_bin50_annotated_cancer.pkl", "rb") as f:
    lnc_bin50 = pickle.load(f)
lnc_bin50.var['ENS']=lnc_bin50.var.index
lnc_bin50.var.index=lnc_bin50.var['real_gene_name']

lnc_bin50.raw.var['ENS']=lnc_bin50.raw.var.index
lnc_bin50.raw.var.index=lnc_bin50.raw.var['real_gene_name']
lnc_bin50.var_names = lnc_bin50.var['real_gene_name']

lnc_bin50
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:887: UserWarning: 
AnnData expects .var.index to contain strings, but got values like:
    ['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")
Out[2]:
AnnData object with n_obs × n_vars = 83575 × 206923
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [3]:
lnc_bin50_all=lnc_bin50
lnc_bin50 = lnc_bin50[:, ~lnc_bin50.var_names.str.startswith("cuTAR")].copy()
lnc_bin50
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Out[3]:
AnnData object with n_obs × n_vars = 83575 × 34559
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [4]:
lnc_bin50
Out[4]:
AnnData object with n_obs × n_vars = 83575 × 34559
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [ ]:
sc.pl.spatial(lnc_bin50,color=['cuTAR100799','cuTAR48562','cuTAR81095','cuTAR167236'], spot_size=50, vmax=2, cmap="Reds")
In [60]:
sc.pl.spatial(lnc_bin50,color=['cuTAR141997','cuTAR19863','cuTAR86784','cuTAR30875'], spot_size=70, vmax=2, cmap="Reds")
No description has been provided for this image
In [69]:
# pan-cancer
sc.pl.spatial(lnc_bin50,color=['cuTAR262781','cuTAR17199','cuTAR162708','cuTAR87324'], spot_size=80, vmax=2, cmap="Reds")
No description has been provided for this image
In [11]:
sc.pl.spatial(lnc_bin50,color=['cuTAR30875','cuTAR48563'], spot_size=80, vmax=2, cmap="Reds")
No description has been provided for this image
In [17]:
sc.pl.spatial(lnc_bin50,color=['cuTAR30875'], spot_size=80, vmax=1, cmap="inferno")
No description has been provided for this image
In [ ]:
 
In [75]:
import numpy as np
import pandas as pd

# Step 1: Ensure gene names are strings and unique
lnc_bin50.var_names = pd.Index(lnc_bin50.var_names.astype(str))
lnc_bin50.var_names_make_unique()

# Step 2: Get gene names and filter for cuTAR
genes = lnc_bin50.var_names
cutar_mask = genes.str.startswith("cuTAR")
cutar_genes = genes[cutar_mask]

# Step 3: Subset expression matrix for those genes
cutar_counts = lnc_bin50[:, cutar_genes].X

# Step 4: Count number of cells each gene is expressed in
if hasattr(cutar_counts, 'getnnz'):  # sparse
    cell_counts = cutar_counts.getnnz(axis=0)
else:
    cell_counts = np.sum(cutar_counts > 0, axis=0)

# Step 5: Count genes expressed in at least 3 cells
num_cutars_expressed3 = np.sum(cell_counts >= 3)
num_cutars_expressed5 = np.sum(cell_counts >= 5)
num_cutars_expressed10 = np.sum(cell_counts >= 10)
num_cutars_expressed100 = np.sum(cell_counts >= 100)
print("cuTAR genes expressed in ≥3 cells:", num_cutars_expressed3)
print("cuTAR genes expressed in ≥5 cells:", num_cutars_expressed5)
print("cuTAR genes expressed in ≥10 cells:", num_cutars_expressed10)
print("cuTAR genes expressed in ≥100 cells:", num_cutars_expressed100)
cuTAR genes expressed in ≥3 cells: 116860
cuTAR genes expressed in ≥5 cells: 88045
cuTAR genes expressed in ≥10 cells: 52417
cuTAR genes expressed in ≥100 cells: 1691
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
In [9]:
# Step 6: Save the list of cuTARs expressed in ≥3 cells
cutar_genes_expressed3 = cutar_genes[cell_counts >= 3]
cutar_genes_expressed3.to_series().to_csv("cuTAR_expressed_in_3_or_more_cells.txt", index=False, header=False)
In [ ]:
 
In [37]:
import pandas as pd
import numpy as np

# List of target cuTAR genes
target_genes = ['cuTAR141997','cuTAR288960','cuTAR94762','cuTAR280980','cuTAR201736', 'cuTAR42228',
                'cuTAR19863', 'cuTAR86784', 'cuTAR78711']

# Ensure gene names are strings and unique
lnc_bin50.var_names = pd.Index(lnc_bin50.var_names.astype(str))
lnc_bin50.var_names_make_unique()

# Filter to existing genes only
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]

if not existing_genes:
    print("None of the target genes found in the dataset.")
else:
    # Subset expression matrix for the selected genes
    expr = lnc_bin50[:, existing_genes].X

    # Convert to dense if sparse
    if hasattr(expr, 'toarray'):
        expr = expr.toarray()

    # Create DataFrame: rows = cells, columns = genes
    expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)

    # Add Cancer info
    expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values

    # Convert expression to binary (expressed > 0)
    binary_expr = expr_df[existing_genes] > 0
    binary_expr['Cancer'] = expr_df['Cancer']

    # Count number of expressing cells per gene per Cancer group
    summary = binary_expr.groupby('Cancer')[existing_genes].sum()

    print("Number of cells expressing each gene per Cancer group:")
    print(summary)


import seaborn as sns
import matplotlib.pyplot as plt

# Reset index so 'Cancer' becomes a column for plotting
summary_reset = summary.reset_index()

# Convert to long format for seaborn
plot_df = summary_reset.melt(id_vars="Cancer", 
                             var_name="Gene", 
                             value_name="Cell Count")

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_df, x="Cancer", y="Cell Count", hue="Gene")

plt.title("Number of Cells Expressing Each cuTAR Gene per Cancer Sample")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="cuTAR Gene")
plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
Number of cells expressing each gene per Cancer group:
                   cuTAR141997  cuTAR288960  cuTAR94762  cuTAR280980  \
Cancer                                                                 
Colorectal_cancer           16           11           4            0   
GBM                         38           19           5            1   
Melanoma                    56           18          10            0   

                   cuTAR201736  cuTAR42228  cuTAR19863  cuTAR86784  cuTAR78711  
Cancer                                                                          
Colorectal_cancer           11           2           2           2           0  
GBM                         32           2           6          11           2  
Melanoma                    25          10           3           7           0  
No description has been provided for this image
In [49]:
target_genes = ['cuTAR141997','cuTAR94762', 'cuTAR42228','cuTAR59116','cuTAR291293','cuTAR30875','cuTAR23054','cuTAR293520']

# Filter for genes that exist in var_names
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]

if not existing_genes:
    print("None of the target cuTAR genes found.")
else:
    # Subset expression
    expr = lnc_bin50[:, existing_genes].X

    # Convert sparse matrix to dense if needed
    if hasattr(expr, 'toarray'):
        expr = expr.toarray()

    # Create DataFrame (cells x genes)
    expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)

    # Add Cancer group info
    expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values

    # Group by Cancer and compute total expression (sum)
    total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()

    # Melt for plotting
    plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')

    # Plot total expression
    plt.figure(figsize=(10, 6))
    sns.barplot(data=plot_df, x='Cancer', y='Total Expression', hue='Gene')

    plt.title("Total Expression of cuTAR Genes per Cancer Group")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend(title="Gene")
    plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
No description has been provided for this image
In [59]:
target_genes = ['cuTAR141997','cuTAR94762', 'cuTAR42228','cuTAR59116','cuTAR291293','cuTAR30875','cuTAR293520']


# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]

if not existing_genes:
    print("None of the target cuTAR genes found.")
else:
    # Get expression matrix
    expr = lnc_bin50[:, existing_genes].X

    # Convert to dense if sparse
    if hasattr(expr, 'toarray'):
        expr = expr.toarray()

    # Create DataFrame: cells x genes
    expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)

    # Add Cancer group/sample info
    expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values

    # Group by Cancer and compute total expression
    total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()

    # Reshape for plotting
    plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')

    # Plot: genes on X-axis, grouped by Cancer
    plt.figure(figsize=(10, 6))
    sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')

    plt.title("Total Expression of Melanoma specific cuTARs")
    plt.tight_layout()
    plt.legend(title='Sample')
    plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
No description has been provided for this image
In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ---- target genes ----
target_genes = ['cuTAR141997','cuTAR94762', 'cuTAR42228','cuTAR59116','cuTAR291293','cuTAR30875','cuTAR293520']

# Ensure gene names are strings and unique
lnc_bin50.var_names = pd.Index(lnc_bin50.var_names.astype(str))
lnc_bin50.var_names_make_unique()

# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]

if not existing_genes:
    print("None of the target cuTAR genes found.")
else:
    # Get expression matrix
    expr = lnc_bin50[:, existing_genes].X

    # Convert to dense if sparse
    if hasattr(expr, 'toarray'):
        expr = expr.toarray()

    # Create DataFrame: cells x genes
    expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)

    # Add Cancer group/sample info
    expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values

    # Group by Cancer and compute total expression
    total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()

    # Reshape for plotting
    plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')

    # ---- Plot and save as PDF ----
    plt.figure(figsize=(10, 6))
    sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')
    plt.title("Total Expression of Melanoma specific cuTARs")
    plt.tight_layout()
    plt.legend(title='Sample')

    # Save
    output_pdf = "melanoma_cuTAR_expression_stomics.pdf"
    plt.savefig(output_pdf, format="pdf")
    plt.close()

    print(f"Saved plot to {output_pdf}")
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
Saved plot to melanoma_cuTAR_expression_stomics.pdf
In [74]:
# CRC cuTARs

target_genes = ['cuTAR86784','cuTAR328192','cuTAR88740','cuTAR323949','cuTAR283862']


# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]

if not existing_genes:
    print("None of the target cuTAR genes found.")
else:
    # Get expression matrix
    expr = lnc_bin50[:, existing_genes].X

    # Convert to dense if sparse
    if hasattr(expr, 'toarray'):
        expr = expr.toarray()

    # Create DataFrame: cells x genes
    expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)

    # Add Cancer group/sample info
    expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values

    # Group by Cancer and compute total expression
    total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()

    # Reshape for plotting
    plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')

    # Plot: genes on X-axis, grouped by Cancer
    plt.figure(figsize=(10, 6))
    sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')

    plt.title("Total Expression of CRC cuTARs")
    plt.tight_layout()
    plt.legend(title='Sample')
    plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
No description has been provided for this image
In [58]:
# pan cancer cuTARs
target_genes = ['cuTAR19863',
'cuTAR291690',
'cuTAR262781',
'cuTAR137818',
'cuTAR87324',
#'cuTAR170802',
'cuTAR51249',
'cuTAR129713',
'cuTAR100897',
'cuTAR171799',
'cuTAR301342',
'cuTAR162708',
'cuTAR162807',
'cuTAR186121']

#target_genes = ['cuTAR78711','cuTAR276859','cuTAR290541','cuTAR270431','cuTAR207309','cuTAR6939'] # GBM specific


# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]

if not existing_genes:
    print("None of the target cuTAR genes found.")
else:
    # Get expression matrix
    expr = lnc_bin50[:, existing_genes].X

    # Convert to dense if sparse
    if hasattr(expr, 'toarray'):
        expr = expr.toarray()

    # Create DataFrame: cells x genes
    expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)

    # Add Cancer group/sample info
    expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values

    # Group by Cancer and compute total expression
    total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()

    # Reshape for plotting
    plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')

    # Plot: genes on X-axis, grouped by Cancer
    plt.figure(figsize=(10, 6))
    sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')

    plt.title("Total Expression of Pan-cancer cuTARs") 
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend(title='Sample')
    plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [4]:
import anndata
import pandas as pd


print("Original var_names:", lnc_bin50.var_names[:5])  # Preview first 5
print("Type of var_names:", type(lnc_bin50.var_names))
print("Is categorical:", pd.api.types.is_categorical_dtype(lnc_bin50.var_names))

# Option 1: Convert var_names to strings and make unique
# This avoids the categorical issue by using a simple string index
lnc_bin50.var_names = lnc_bin50.var_names.astype(str)  # Convert to string
lnc_bin50.var_names_make_unique()  # Now this should work

# Verify the result
print("Updated var_names:", lnc_bin50.var_names[:5])  # Check first 5
print("Any duplicates:", lnc_bin50.var_names.has_duplicates)  # Should be False
Original var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Type of var_names: <class 'pandas.core.indexes.base.Index'>
Is categorical: False
Updated var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Any duplicates: False
In [5]:
# Step 2: Check and fix var_names duplicates BEFORE setting .raw
print("Before fix:")
print("Any duplicates in adata.var_names:", lnc_bin50.var_names.has_duplicates)
dupes = lnc_bin50.var_names[lnc_bin50.var_names.duplicated()]
print("Duplicated gene names:")
print(dupes)

# Fix duplicates
lnc_bin50.var_names_make_unique()

# Step 3: Set .raw after fixing
lnc_bin50.raw = lnc_bin50

# Step 4: Confirm fix
print("\nAfter fix:")
print("Any duplicates in .raw.var_names:", lnc_bin50.raw.var_names.has_duplicates)
Before fix:
Any duplicates in adata.var_names: False
Duplicated gene names:
Index([], dtype='object', name='real_gene_name')

After fix:
Any duplicates in .raw.var_names: False
In [4]:
adata_mel = lnc_bin50[lnc_bin50.obs['Cancer'] == 'Melanoma'].copy()
adata_mel
Out[4]:
AnnData object with n_obs × n_vars = 31955 × 34559
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [23]:
sc.pl.spatial(adata_mel,color=['cuTAR94762','cuTAR288960'], spot_size=65, vmax=1, cmap="inferno", show=False)
Out[23]:
[<Axes: title={'center': 'cuTAR94762'}, xlabel='spatial1', ylabel='spatial2'>,
 <Axes: title={'center': 'cuTAR288960'}, xlabel='spatial1', ylabel='spatial2'>]
No description has been provided for this image
In [15]:
import matplotlib.pyplot as plt

with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
    sc.pl.spatial(adata_mel,color=['cuTAR94762'], spot_size=65, vmax=1, cmap="inferno", show=False)
    plt.savefig("/scratch/project/stseq/Prakrithi/STOMICS/cuTAR94762_mel.pdf", bbox_inches="tight")
No description has been provided for this image
In [11]:
sc.pl.spatial(lnc_bin50_all,color=['cuTAR78711','cuTAR276859'], spot_size=55, vmax=1, cmap="Reds", show=False)
Out[11]:
[<Axes: title={'center': 'cuTAR78711'}, xlabel='spatial1', ylabel='spatial2'>,
 <Axes: title={'center': 'cuTAR276859'}, xlabel='spatial1', ylabel='spatial2'>]
No description has been provided for this image
In [ ]:
 
In [31]:
import matplotlib.pyplot as plt

with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
    sc.pl.spatial(adata_mel,color=['cuTAR30875'], spot_size=65, vmax=1, cmap="inferno", show=False)
    plt.savefig("/scratch/project/stseq/Prakrithi/STOMICS/cuTAR30875_mel.pdf", bbox_inches="tight")
No description has been provided for this image
In [7]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_mel)
# Logarithmize the data
sc.pp.log1p(adata_mel)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_mel.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
    adata_mel.obs["total_counts"][adata_mel.obs["total_counts"] < 10000],
    kde=False,
    bins=40,
    ax=axs[1],
)
sns.histplot(adata_mel.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
    adata_mel.obs["n_genes_by_counts"][adata_mel.obs["n_genes_by_counts"] < 4000],
    kde=False,
    bins=60,
    ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
  warn(UserWarning('Some cells have zero counts'))
Out[7]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
No description has been provided for this image
In [8]:
sc.pp.filter_cells(adata_mel, max_counts=800)
adata_mel = adata_mel[adata_mel.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_mel.n_obs}")
sc.pp.filter_genes(adata_mel, min_cells=10)
adata_mel
#cells after MT filter: 31949
Out[8]:
AnnData object with n_obs × n_vars = 31949 × 23229
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [9]:
sc.tl.pca(adata_mel, use_highly_variable=False)
sc.pp.neighbors(adata_mel)
sc.tl.umap(adata_mel)
2025-06-03 11:45:11.865623: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-03 11:45:13.703442: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-03 11:45:14.315555: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-03 11:45:14.315596: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-06-03 11:45:20.703254: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-06-03 11:45:20.703475: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2025-06-03 11:45:20.703483: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
In [18]:
sc.tl.leiden(adata_mel, key_added="leiden_res_0.2", resolution=0.2)
sc.pl.umap(adata_mel, color=["leiden_res_0.2"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [19]:
sc.pl.spatial(adata_mel, color=["leiden_res_0.1","leiden_res_0.2","leiden_res_0.3","leiden_res_0.5"], spot_size=50)
No description has been provided for this image
In [61]:
sc.pl.spatial(adata_mel, spot_size=50, color='total_counts')
No description has been provided for this image
In [76]:
mel_marker_genes = {
    #"CRC": ["CEACAM5", "CDX2", "EPCAM", "KRT20", "MUC2", "CLDN1"],
    "melanoma": ["MITF", "TYR", "DCT", "MLANA", "PMEL", "SOX10","KRT5","PRAME","S100B"]
}
# Make unique if duplicates exist
sc.pl.dotplot(
    adata_mel,
    var_names=mel_marker_genes,
    groupby='leiden_res_0.5',
    standard_scale='var',
    color_map='Reds',
    show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [ ]:
 
In [30]:
adata_mel_all = lnc_bin50_all[lnc_bin50_all.obs['Cancer'] == 'Melanoma'].copy()
adata_mel_all
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Out[30]:
AnnData object with n_obs × n_vars = 31955 × 206923
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [38]:
adata_mel_all.obs['leiden_res_0.5']=adata_mel.obs['leiden_res_0.5']
adata_mel_all.obs
Out[38]:
total_counts n_genes_by_counts pct_counts_mt leiden orig.ident x y Cancer leiden_res_0.5
10093173156700 2 1 0.0 8 sample 2350 11100 Melanoma 1
10307921515800 1 1 0.0 8 sample 2400 5400 Melanoma 1
10307921516300 1 1 0.0 8 sample 2400 5900 Melanoma 1
10307921516600 3 1 0.0 8 sample 2400 6200 Melanoma 1
10307921517350 2 2 0.0 8 sample 2400 6950 Melanoma 1
... ... ... ... ... ... ... ... ... ...
43164421343700 22 20 0.0 6 sample 10050 18900 Melanoma 1
43164421343750 86 59 0.0 5 sample 10050 18950 Melanoma 21
43164421343800 8 8 0.0 8 sample 10050 19000 Melanoma 1
43379169708500 25 19 0.0 6 sample 10100 18900 Melanoma 18
43379169708550 27 24 0.0 6 sample 10100 18950 Melanoma 7

31955 rows × 9 columns

In [ ]:
 
In [33]:
# Step 0: Make gene names unique
adata_mel_all.var_names_make_unique()

# Step 1: Filter genes starting with "cuTAR"
cutar_genes = [gene for gene in adata_mel_all.var_names if gene.startswith("cuTAR")]

# Step 2: Subset AnnData for only cuTAR genes
adata_cutar = adata_mel_all[:, cutar_genes]
In [ ]:
 
In [41]:
import scanpy as sc
import pandas as pd

# Assume adata_mel_all is your AnnData object
# Step 1: Check cluster labels and gene names
print("Cluster labels:", adata_mel_all.obs['leiden_res_0.5'].value_counts())
print("First few gene names:", adata_mel_all.var_names[:5])

# Step 2: Preprocess data
# Check if data is raw counts
print("Sample of raw data:", adata_mel_all.X[:5, :5].toarray())  # Preview first 5 cells, 5 genes
# Normalize total counts (e.g., to 10,000 per cell) and logarithmize
sc.pp.normalize_total(adata_mel_all, target_sum=1e4)
sc.pp.log1p(adata_mel_all)  # Log-transform (log(1 + x))
print("Sample of normalized, log-transformed data:", adata_mel_all.X[:5, :5].toarray())

# Step 3: Filter genes starting with "cuTAR"
cuTAR_genes = [gene for gene in adata_mel_all.var_names if gene.startswith('cuTAR')]
print(f"Number of cuTAR genes: {len(cuTAR_genes)}")
print("First few cuTAR genes:", cuTAR_genes[:5])

# Step 4: Subset AnnData to only cuTAR genes
adata_cuTAR = adata_mel_all[:, cuTAR_genes].copy()

# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.5', groups=['0'], reference='rest', method='wilcoxon')

# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='0')
# Filter for highly expressed genes in cluster 0 (positive log fold change, significant p-value)
highly_de = de_results[
    (de_results['logfoldchanges'] > 0) &  # Upregulated in cluster 0
    (de_results['pvals_adj'] < 0.05)      # Adjusted p-value < 0.05 for significance
].sort_values('logfoldchanges', ascending=False)  # Sort by log fold change (highest first)

# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 0:")
print(highly_de[['names', 'logfoldchanges', 'pvals_adj']])

# Optional: Save results to a CSV file
#highly_de.to_csv('de_cuTAR_cluster0.csv', index=False)
Cluster labels: 0     4836
1     4248
2     1526
3     1289
4     1278
5     1227
6     1193
7     1159
8     1113
9     1065
10    1017
11     984
12     943
13     878
14     876
15     874
16     859
17     854
18     836
19     816
20     808
21     726
22     714
23     665
24     518
25     412
26     175
27      60
Name: leiden_res_0.5, dtype: int64
First few gene names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Sample of raw data: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Sample of normalized, log-transformed data: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Number of cuTAR genes: 172364
First few cuTAR genes: ['cuTAR1', 'cuTAR10', 'cuTAR100', 'cuTAR1000', 'cuTAR10000']
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 0:
           names  logfoldchanges      pvals_adj
0     cuTAR48563        5.034541   0.000000e+00
11    AC010967.1        3.745001   2.496113e-24
115          TYR        3.676590   1.014090e-02
1         RBFOX1        3.612103  1.175379e-148
25         PCSK2        3.520043   3.017661e-10
..           ...             ...            ...
69      cuTAR127        0.656923   1.623822e-04
27   cuTAR167235        0.615259   8.369745e-10
113  cuTAR100799        0.559353   6.501664e-03
56   cuTAR162694        0.546771   1.473693e-05
33   cuTAR170802        0.531583   5.421090e-08

[156 rows x 3 columns]
In [42]:
highly_de
Out[42]:
names scores logfoldchanges pvals pvals_adj
0 cuTAR48563 43.581669 5.034541 0.000000e+00 0.000000e+00
11 AC010967.1 11.087194 3.745001 1.447561e-28 2.496113e-24
115 TYR 4.537781 3.676590 5.684936e-06 1.014090e-02
1 RBFOX1 26.407085 3.612103 1.136055e-153 1.175379e-148
25 PCSK2 7.567946 3.520043 3.791710e-14 3.017661e-10
... ... ... ... ... ...
69 cuTAR127 5.434558 0.656923 5.493227e-08 1.623822e-04
27 cuTAR167235 7.424442 0.615259 1.132561e-13 8.369745e-10
113 cuTAR100799 4.634272 0.559353 3.581959e-06 6.501664e-03
56 cuTAR162694 5.881750 0.546771 4.059505e-09 1.473693e-05
33 cuTAR170802 6.823134 0.531583 8.907519e-12 5.421090e-08

156 rows × 5 columns

In [43]:
import scanpy as sc
import pandas as pd

# Assume adata_mel_all is your AnnData object
# Step 1: Check cluster labels and gene names
print("Cluster labels:", adata_mel_all.obs['leiden_res_0.5'].value_counts())
print("First few gene names:", adata_mel_all.var_names[:5])

# Step 2: Preprocess data
# Check if data is raw counts
print("Sample of raw data:", adata_mel_all.X[:5, :5].toarray())
# Normalize total counts and logarithmize
sc.pp.normalize_total(adata_mel_all, target_sum=1e4)
sc.pp.log1p(adata_mel_all)
print("Sample of normalized, log-transformed data:", adata_mel_all.X[:5, :5].toarray())

# Step 3: Filter genes starting with "cuTAR"
cuTAR_genes = [gene for gene in adata_mel_all.var_names if gene.startswith('cuTAR')]
print(f"Number of cuTAR genes: {len(cuTAR_genes)}")
print("First few cuTAR genes:", cuTAR_genes[:5])

# Step 4: Subset AnnData to only cuTAR genes
adata_cuTAR = adata_mel_all[:, cuTAR_genes].copy()

# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.5', groups=['0'], reference='rest', method='wilcoxon')

# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='0')
# Filter for highly expressed genes in cluster 0 and only cuTAR genes
highly_de_cuTAR = de_results[
    (de_results['logfoldchanges'] > 0) &  # Upregulated in cluster 0
    (de_results['pvals_adj'] < 0.05) &    # Adjusted p-value < 0.05 for significance
    (de_results['names'].str.startswith('cuTAR'))  # Only cuTAR genes
].sort_values('logfoldchanges', ascending=False)  # Sort by log fold change (highest first)

# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 0:")
print(highly_de_cuTAR[['names', 'logfoldchanges', 'pvals_adj']])

# Optional: Save results to a CSV file
#
Cluster labels: 0     4836
1     4248
2     1526
3     1289
4     1278
5     1227
6     1193
7     1159
8     1113
9     1065
10    1017
11     984
12     943
13     878
14     876
15     874
16     859
17     854
18     836
19     816
20     808
21     726
22     714
23     665
24     518
25     412
26     175
27      60
Name: leiden_res_0.5, dtype: int64
First few gene names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Sample of raw data: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
WARNING: adata.X seems to be already log-transformed.
Sample of normalized, log-transformed data: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Number of cuTAR genes: 172364
First few cuTAR genes: ['cuTAR1', 'cuTAR10', 'cuTAR100', 'cuTAR1000', 'cuTAR10000']
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 0:
           names  logfoldchanges     pvals_adj
0     cuTAR48563        5.034541  0.000000e+00
24    cuTAR48562        1.592207  2.117516e-10
50   cuTAR141754        1.585270  5.831195e-06
62   cuTAR141755        1.205844  5.697473e-05
4    cuTAR167240        0.919206  3.277730e-40
8     cuTAR81095        0.915182  1.242874e-30
3        cuTAR64        0.891957  3.858411e-46
17   cuTAR167236        0.833358  3.023247e-15
14       cuTAR39        0.764167  1.916128e-15
21   cuTAR167239        0.746357  2.033063e-11
15   cuTAR162573        0.692954  2.232043e-15
12       cuTAR63        0.686892  1.104622e-20
69      cuTAR127        0.656923  1.623822e-04
27   cuTAR167235        0.615259  8.369745e-10
113  cuTAR100799        0.559353  6.501664e-03
56   cuTAR162694        0.546771  1.473693e-05
33   cuTAR170802        0.531583  5.421090e-08
In [44]:
highly_de_cuTAR.to_csv('de_cuTAR_cluster0_bin50_mel.csv', index=False)
In [45]:
highly_de_cuTAR
Out[45]:
names scores logfoldchanges pvals pvals_adj
0 cuTAR48563 43.581669 5.034541 0.000000e+00 0.000000e+00
24 cuTAR48562 7.618906 1.592207 2.558338e-14 2.117516e-10
50 cuTAR141754 6.051256 1.585270 1.437206e-09 5.831195e-06
62 cuTAR141755 5.636575 1.205844 1.734659e-08 5.697473e-05
4 cuTAR167240 14.048035 0.919206 7.920169e-45 3.277730e-40
8 cuTAR81095 12.341619 0.915182 5.405811e-35 1.242874e-30
3 cuTAR64 14.998953 0.891957 7.458642e-51 3.858411e-46
17 cuTAR167236 8.983207 0.833358 2.629889e-19 3.023247e-15
14 cuTAR39 9.053150 0.764167 1.389016e-19 1.916128e-15
21 cuTAR167239 7.931706 0.746357 2.161548e-15 2.033063e-11
15 cuTAR162573 9.029417 0.692954 1.725892e-19 2.232043e-15
12 cuTAR63 10.301462 0.686892 6.939823e-25 1.104622e-20
69 cuTAR127 5.434558 0.656923 5.493227e-08 1.623822e-04
27 cuTAR167235 7.424442 0.615259 1.132561e-13 8.369745e-10
113 cuTAR100799 4.634272 0.559353 3.581959e-06 6.501664e-03
56 cuTAR162694 5.881750 0.546771 4.059505e-09 1.473693e-05
33 cuTAR170802 6.823134 0.531583 8.907519e-12 5.421090e-08
In [73]:
sc.pl.spatial(adata_mel_all, color=['cuTAR48562','cuTAR141755'], cmap="inferno", spot_size=70, vmax=2) #reds spot 70, vmax=5
No description has been provided for this image
In [48]:
# Make unique if duplicates exist
sc.pl.dotplot(
    adata_mel,
    var_names=highly_de_cuTAR['names'],
    groupby='leiden_res_0.5',
    standard_scale='var',
    dot_max=0.5,
    color_map='Reds',
    show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [82]:
genes_to_plot = [
    # Innate immune sensors
    "TLR1", "TLR2", "TLR3", "TLR4", "TLR5", "TLR7", "TLR8", "TLR9",
    "NOD1", "NOD2", "RIGI", "MDA5", "STING1",

    # Cytokines and receptors
    "IL1B", "IL6", "IL10", "IL12A", "IL12B", "TNF", "IFNA1", "IFNB1", "IFNG",
    "CXCL8", "CCL2", "CCL3", "CXCL10", "CCR5", "CXCR3",

    # Antigen presentation
    "HLA-A", "HLA-B", "HLA-C", "HLA-DRA", "HLA-DRB1", "CD74", "TAP1", "TAP2",

    # T cell markers
    "CD3D", "CD3E", "CD4", "CD8A", "CD8B", "IL2RA", "FOXP3", "CTLA4",

    # B cell markers
    "CD19", "CD20", "CD79A", "CD79B", "MS4A1",

    # NK cell markers
    "NCAM1", "KLRD1", "NKG7", "GNLY", "GZMB", "PRF1",

    # Macrophage/monocyte markers
    "CD14", "CD68", "CD163", "CSF1R", "ITGAM",

    # Dendritic cell markers
    "ITGAX", "CD80", "CD86", "CCR7",

    # Inflammation and signaling
    "STAT1", "STAT3", "IRF1", "IRF3", "IRF7", "NFKB1", "RELA"
]



# Ensure genes are in adata.var_names
genes_to_plot = [gene for gene in genes_to_plot if gene in adata_mel.var_names]
if not genes_to_plot:
    raise ValueError("None of the specified genes are in adata.var_names")

sc.pl.dotplot(
    adata_mel,
    var_names=genes_to_plot,
    groupby='leiden_res_0.5',  # Assumes adata.obs['organ'] has 'salivary_gland', 'midgut'
    standard_scale='var',  # Scales expression to [0, 1] per gene
    color_map='Reds',  # Color map for mean expression
    show=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [97]:
import pickle

# Save
with open("lnc_Melanoma_processed_bin50.pkl", "wb") as f:
    pickle.dump(adata_mel, f)


with open("lnc_Melanoma_processed_bin50_with_cuTARs.pkl", "wb") as f:
    pickle.dump(adata_mel_all, f)
In [ ]:
 
In [ ]:
 
In [ ]:
 

GBM¶

In [83]:
adata_GBM = lnc_bin50[lnc_bin50.obs['Cancer'] == 'GBM'].copy()
adata_GBM
Out[83]:
AnnData object with n_obs × n_vars = 30319 × 34559
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [84]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_GBM)
# Logarithmize the data
sc.pp.log1p(adata_GBM)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_GBM.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
    adata_GBM.obs["total_counts"][adata_GBM.obs["total_counts"] < 10000],
    kde=False,
    bins=40,
    ax=axs[1],
)
sns.histplot(adata_GBM.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
    adata_GBM.obs["n_genes_by_counts"][adata_GBM.obs["n_genes_by_counts"] < 4000],
    kde=False,
    bins=60,
    ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
  warn(UserWarning('Some cells have zero counts'))
Out[84]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
No description has been provided for this image
In [85]:
sc.pp.filter_cells(adata_GBM, max_counts=500)
adata_GBM = adata_GBM[adata_GBM.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_GBM.n_obs}")
sc.pp.filter_genes(adata_GBM, min_cells=10)
adata_GBM
#cells after MT filter: 30306
Out[85]:
AnnData object with n_obs × n_vars = 30306 × 22930
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [86]:
sc.tl.pca(adata_GBM, use_highly_variable=False)
sc.pp.neighbors(adata_GBM)
sc.tl.umap(adata_GBM)
In [87]:
sc.tl.leiden(adata_GBM, key_added="leiden_res_0.2", resolution=0.2)
sc.pl.umap(adata_GBM, color=["leiden_res_0.2"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [94]:
sc.tl.leiden(adata_GBM, key_added="leiden_res_0.3", resolution=0.3)
sc.pl.umap(adata_GBM, color=["leiden_res_0.3"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [95]:
sc.pl.spatial(adata_GBM, color=["leiden_res_0.2","leiden_res_0.3","leiden_res_0.5"], spot_size=70)
#sc.pl.spatial(adata_GBM, color=["leiden_res_0.1","leiden_res_0.2","leiden_res_0.3","leiden_res_0.5"], spot_size=50)
No description has been provided for this image
In [96]:
gbm_marker_genes = [
    # General GBM markers
    "EGFR", "PDGFRA", "IDH1", "TP53", "NF1", "MGMT", "MDM2", "MDM4",
    # Proliferation / stemness
    "MKI67", "PROM1", "NES",
    # Mesenchymal subtype / immune interaction
    "CD44", "STAT3", "TNFAIP3", "CD274",
    # Proneural subtype
    "PDGFRA",
    # Classical subtype
    "EGFR", "NES", "BCAN",
    # Neural subtype (less reliable in bulk)
    "SLC12A5", "SYT1", "SNAP25",
    # Microenvironment / immune
    "ITGAM", "CX3CR1", "PTPRC", "CD163", "CSF1R", "HLA-DRA",
    # Astrocytic / glial markers
    "AQP4",
    # Oligodendrocyte / OPC
    "PDGFRA", "CSPG4", "MBP"
]


# Ensure genes are in adata.var_names
genes_to_plot = [gene for gene in genes_to_plot if gene in adata_GBM.var_names]
if not genes_to_plot:
    raise ValueError("None of the specified genes are in adata.var_names")
sc.pl.dotplot(
    adata_GBM,
    var_names=gbm_marker_genes,
    groupby='leiden_res_0.3',
    standard_scale='var',
    color_map='Reds',
    show=True, use_raw=False
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image

CRC¶

In [10]:
adata_CRC = lnc_bin50[lnc_bin50.obs['Cancer'] == 'Colorectal_cancer'].copy()
adata_CRC
Out[10]:
AnnData object with n_obs × n_vars = 21301 × 34559
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [11]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_CRC)
# Logarithmize the data
sc.pp.log1p(adata_CRC)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_CRC.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
    adata_CRC.obs["total_counts"][adata_CRC.obs["total_counts"] < 10000],
    kde=False,
    bins=40,
    ax=axs[1],
)
sns.histplot(adata_CRC.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
    adata_CRC.obs["n_genes_by_counts"][adata_CRC.obs["n_genes_by_counts"] < 4000],
    kde=False,
    bins=60,
    ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
  warn(UserWarning('Some cells have zero counts'))
Out[11]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
No description has been provided for this image
In [12]:
sc.pp.filter_cells(adata_CRC, max_counts=400)
adata_CRC = adata_CRC[adata_CRC.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_CRC.n_obs}")
sc.pp.filter_genes(adata_CRC, min_cells=10)
adata_CRC
#cells after MT filter: 21294
Out[12]:
AnnData object with n_obs × n_vars = 21294 × 18284
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [13]:
sc.tl.pca(adata_CRC, use_highly_variable=False)
sc.pp.neighbors(adata_CRC)
sc.tl.umap(adata_CRC)
2025-06-04 11:58:57.395591: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-04 11:59:02.329185: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-04 11:59:02.329240: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-06-04 11:59:57.703533: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-06-04 11:59:57.703662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2025-06-04 11:59:57.703672: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
In [20]:
sc.tl.leiden(adata_CRC, key_added="leiden_res_0.3", resolution=0.3)
sc.pl.umap(adata_CRC, color=["leiden_res_0.3"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [24]:
sc.pl.spatial(adata_CRC, color=["leiden_res_0.2","leiden_res_0.3"], spot_size=60) #,"leiden_res_0.5"
No description has been provided for this image
In [22]:
CRC_marker_genes = [
    "APC",           # Tumor suppressor, often mutated early
    "TP53",          # Tumor suppressor, frequently mutated
    "KRAS",          # Oncogene, common driver mutation
    "BRAF",          # Oncogene, V600E mutation often in MSI-high CRC
    "PIK3CA",        # Involved in PI3K-AKT signaling
    "SMAD4",         # TGF-beta signaling, tumor suppressor
    "MLH1",          # DNA mismatch repair gene
    "MSH2",          # Mismatch repair gene, Lynch syndrome
    "MSH6",          # Mismatch repair gene
    "PMS2",          # Mismatch repair gene
    "CDX2",          # Differentiation marker for intestinal epithelium
    "CEACAM5",       # CEA – used as a serum marker
    "EPCAM",         # Cell adhesion, involved in Lynch syndrome
    "LGR5",          # Stem cell marker in colorectal epithelium
    "AXIN2",         # Wnt signaling pathway component
    "CTNNB1",        # β-catenin, Wnt pathway
    "MET",           # Receptor tyrosine kinase, prognostic
    "VEGFA",         # Angiogenesis, target of bevacizumab
    "EGFR",          # Target of cetuximab/panitumumab
    "ALDH1A1",       # Cancer stem cell marker
    "SLC6A6",        # Overexpressed in CRC
    "MMP7",          # Matrix metalloproteinase, invasion marker
    "TNFRSF10B",     # Death receptor pathway
    "CXCL1",         # Inflammatory chemokine, pro-tumorigenic
    "CD44",          # Stem cell and EMT marker
]

genes_to_plot=CRC_marker_genes
# Ensure genes are in adata.var_names
genes_to_plot = [gene for gene in genes_to_plot if gene in adata_CRC.var_names]
if not genes_to_plot:
    raise ValueError("None of the specified genes are in adata.var_names")
sc.pl.dotplot(
    adata_CRC,
    var_names=CRC_marker_genes,
    groupby='leiden_res_0.3',
    standard_scale='var',
    color_map='Reds',
    show=True, use_raw=False
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [45]:
sc.pl.dotplot(
    adata_CRC,
    var_names=CRC_marker_genes,
    groupby='leiden_res_0.2',
    standard_scale='var',
    color_map='Reds',
    show=True, use_raw=False
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [26]:
adata_CRC_all = lnc_bin50_all[lnc_bin50_all.obs['Cancer'] == 'Colorectal_cancer'].copy()

adata_CRC_all.obs['leiden_res_0.3']=adata_CRC.obs['leiden_res_0.3']
adata_CRC_all.obs

# Step 0: Make gene names unique
adata_CRC_all.var_names_make_unique()

# Step 1: Filter genes starting with "cuTAR"
cutar_genes = [gene for gene in adata_CRC_all.var_names if gene.startswith("cuTAR")]

# Step 2: Subset AnnData for only cuTAR genes
adata_cutar = adata_CRC_all[:, cutar_genes]
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
In [40]:
adata_CRC_all
Out[40]:
AnnData object with n_obs × n_vars = 21301 × 206923
    obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'leiden_res_0.3'
    var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
    uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
    obsm: 'X_pca', 'X_umap', 'spatial'
    obsp: 'connectivities', 'distances'
In [42]:
import scanpy as sc
import pandas as pd
adata_CRC_all.obs['leiden_res_0.2']=adata_CRC.obs['leiden_res_0.2']

# Assume adata_mel_all is your AnnData object
# Step 1: Check cluster labels and gene names
print("Cluster labels:", adata_CRC_all.obs['leiden_res_0.2'].value_counts())
print("First few gene names:", adata_CRC_all.var_names[:5])

# Step 2: Preprocess data
# Check if data is raw counts
print("Sample of raw data:", adata_CRC_all.X[:5, :5].toarray())
# Normalize total counts and logarithmize
sc.pp.normalize_total(adata_CRC_all, target_sum=1e4)
sc.pp.log1p(adata_CRC_all)
print("Sample of normalized, log-transformed data:", adata_CRC_all.X[:5, :5].toarray())

# Step 3: Filter genes starting with "cuTAR"
cuTAR_genes = [gene for gene in adata_CRC_all.var_names if gene.startswith('cuTAR')]
print(f"Number of cuTAR genes: {len(cuTAR_genes)}")
print("First few cuTAR genes:", cuTAR_genes[:5])

# Step 4: Subset AnnData to only cuTAR genes
adata_cuTAR = adata_CRC_all[:, cuTAR_genes].copy()
Cluster labels: 0    17567
1     2796
2      931
Name: leiden_res_0.2, dtype: int64
First few gene names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Sample of raw data: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
WARNING: adata.X seems to be already log-transformed.
Sample of normalized, log-transformed data: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Number of cuTAR genes: 172364
First few cuTAR genes: ['cuTAR1', 'cuTAR10', 'cuTAR100', 'cuTAR1000', 'cuTAR10000']
In [32]:
# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.3', groups=['2'], reference='rest', method='wilcoxon')

# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='2')
# Filter for highly expressed genes in cluster 0 and only cuTAR genes
highly_de_cuTAR = de_results[
    (de_results['logfoldchanges'] > 0) &  # Upregulated in cluster 0
    (de_results['pvals_adj'] < 0.05) &    # Adjusted p-value < 0.05 for significance
    (de_results['names'].str.startswith('cuTAR'))  # Only cuTAR genes
].sort_values('logfoldchanges', ascending=False)  # Sort by log fold change (highest first)

# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 2:")
print(highly_de_cuTAR[['names', 'logfoldchanges', 'pvals_adj']])

# Optional: Save results to a CSV file
#
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 2:
        names  logfoldchanges     pvals_adj
4  cuTAR48563        1.608611  1.736617e-45
In [35]:
#highly_de_cuTAR.to_csv('de_cuTAR_cluster0_bin50_CRC.csv', index=False)
sc.pl.spatial(adata_CRC_all, color=['cuTAR48563'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5
# Make unique if duplicates exist
No description has been provided for this image
In [36]:
sc.pl.dotplot(
    adata_CRC_all,
    var_names=highly_de_cuTAR['names'],
    groupby='leiden_res_0.3',
    standard_scale='var',
    color_map='Reds',
    show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [43]:
# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.2', groups=['0'], reference='rest', method='wilcoxon')

# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='0')
# Filter for highly expressed genes in cluster 0 and only cuTAR genes
highly_de_cuTAR = de_results[
    (de_results['logfoldchanges'] > 0) &  # Upregulated in cluster 0
    (de_results['pvals_adj'] < 0.05) &    # Adjusted p-value < 0.05 for significance
    (de_results['names'].str.startswith('cuTAR'))  # Only cuTAR genes
].sort_values('logfoldchanges', ascending=False)  # Sort by log fold change (highest first)

# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 2:")
print(highly_de_cuTAR[['names', 'logfoldchanges', 'pvals_adj']])

# Optional: Save results to a CSV file
#
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 2:
          names  logfoldchanges      pvals_adj
0    cuTAR48563        2.551227  1.066937e-164
6      cuTAR127        0.810377   1.961647e-07
16   cuTAR81095        0.642386   1.625657e-03
3       cuTAR64        0.494573   5.932891e-09
22  cuTAR167240        0.402884   1.595125e-02
In [44]:
sc.pl.dotplot(
    adata_CRC_all,
    var_names=highly_de_cuTAR['names'],
    groupby='leiden_res_0.2',
    standard_scale='var',
    color_map='Reds',
    show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [46]:
sc.pl.spatial(adata_CRC_all, color=['cuTAR48563','cuTAR127','cuTAR167240'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5
No description has been provided for this image

IBD¶

In [98]:
import pickle as pkl
Path_raw_data_2 = "/QRISdata/Q1851/STOmics_SAW_Outputs/IBD_new/"
with open(Path_raw_data_2+"A04627E2_bin20.pkl", "rb") as f:
    IBD = pkl.load(f)
IBD
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[98], line 3
      1 import pickle as pkl
      2 Path_raw_data_2 = "/QRISdata/Q1851/STOmics_SAW_Outputs/IBD_new/"
----> 3 with open(Path_raw_data_2+"A04627E2_bin20.pkl", "rb") as f:
      4     IBD = pkl.load(f)
      5 IBD

File ~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284, in _modified_open(file, *args, **kwargs)
    277 if file in {0, 1, 2}:
    278     raise ValueError(
    279         f"IPython won't let you open fd={file} by default "
    280         "as it is likely to crash IPython. If you know what you are doing, "
    281         "you can use builtins' open."
    282     )
--> 284 return io_open(file, *args, **kwargs)

FileNotFoundError: [Errno 2] No such file or directory: '/QRISdata/Q1851/STOmics_SAW_Outputs/IBD_new/A04627E2_bin20.pkl'
In [47]:
sc.pl.spatial(adata_CRC_all, color=['cuTAR176226'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key, method, tolerance)
   3801 try:
-> 3802     return self._engine.get_loc(casted_key)
   3803 except KeyError as err:

File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:138, in pandas._libs.index.IndexEngine.get_loc()

File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:162, in pandas._libs.index.IndexEngine.get_loc()

File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:203, in pandas._libs.index.IndexEngine._get_loc_duplicates()

File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:211, in pandas._libs.index.IndexEngine._maybe_get_bool_indexer()

File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:107, in pandas._libs.index._unpack_bool_indexer()

KeyError: 'cuTAR176226'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[47], line 1
----> 1 sc.pl.spatial(adata_CRC_all, color=['cuTAR176226'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5

File ~/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:1026, in spatial(adata, basis, img, img_key, library_id, crop_coord, alpha_img, bw, size, scale_factor, spot_size, na_color, show, return_fig, save, **kwargs)
   1023     cmap_img = None
   1024 circle_radius = size * scale_factor * spot_size * 0.5
-> 1026 axs = embedding(
   1027     adata,
   1028     basis=basis,
   1029     scale_factor=scale_factor,
   1030     size=circle_radius,
   1031     na_color=na_color,
   1032     show=False,
   1033     save=False,
   1034     **kwargs,
   1035 )
   1036 if not isinstance(axs, list):
   1037     axs = [axs]

File ~/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:259, in embedding(adata, basis, color, gene_symbols, use_raw, sort_order, edges, edges_width, edges_color, neighbors_key, arrows, arrows_kwds, groups, components, dimensions, layer, projection, scale_factor, color_map, cmap, palette, na_color, na_in_legend, size, frameon, legend_fontsize, legend_fontweight, legend_loc, legend_fontoutline, colorbar_loc, vmax, vmin, vcenter, norm, add_outline, outline_width, outline_color, ncols, hspace, wspace, title, show, save, ax, return_fig, **kwargs)
    252 # use itertools.product to make a plot for each color and for each component
    253 # For example if color=[gene1, gene2] and components=['1,2, '2,3'].
    254 # The plots are: [
    255 #     color=gene1, components=[1,2], color=gene1, components=[2,3],
    256 #     color=gene2, components = [1, 2], color=gene2, components=[2,3],
    257 # ]
    258 for count, (value_to_plot, dims) in enumerate(zip(color, dimensions)):
--> 259     color_source_vector = _get_color_source_vector(
    260         adata,
    261         value_to_plot,
    262         layer=layer,
    263         use_raw=use_raw,
    264         gene_symbols=gene_symbols,
    265         groups=groups,
    266     )
    267     color_vector, categorical = _color_vector(
    268         adata,
    269         value_to_plot,
   (...)
    272         na_color=na_color,
    273     )
    275     # Order points

File ~/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:1195, in _get_color_source_vector(adata, value_to_plot, use_raw, gene_symbols, layer, groups)
   1191     value_to_plot = adata.var.index[adata.var[gene_symbols] == value_to_plot][
   1192         0
   1193     ]  # TODO: Throw helpful error if this doesn't work
   1194 if use_raw and value_to_plot not in adata.obs.columns:
-> 1195     values = adata.raw.obs_vector(value_to_plot)
   1196 else:
   1197     values = adata.obs_vector(value_to_plot, layer=layer)

File ~/.local/lib/python3.8/site-packages/anndata/_core/raw.py:171, in Raw.obs_vector(self, k)
    169 def obs_vector(self, k: str) -> np.ndarray:
    170     # TODO decorator to copy AnnData.obs_vector docstring
--> 171     idx = self._normalize_indices((slice(None), k))
    172     a = self.X[idx]
    173     if issparse(a):

File ~/.local/lib/python3.8/site-packages/anndata/_core/raw.py:162, in Raw._normalize_indices(self, packed_index)
    160 obs, var = unpack_index(packed_index)
    161 obs = _normalize_index(obs, self._adata.obs_names)
--> 162 var = _normalize_index(var, self.var_names)
    163 return obs, var

File ~/.local/lib/python3.8/site-packages/anndata/_core/index.py:72, in _normalize_index(indexer, index)
     70     return indexer
     71 elif isinstance(indexer, str):
---> 72     return index.get_loc(indexer)  # int
     73 elif isinstance(indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)):
     74     if hasattr(indexer, "shape") and (
     75         (indexer.shape == (index.shape[0], 1))
     76         or (indexer.shape == (1, index.shape[0]))
     77     ):

File ~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py:3804, in Index.get_loc(self, key, method, tolerance)
   3802     return self._engine.get_loc(casted_key)
   3803 except KeyError as err:
-> 3804     raise KeyError(key) from err
   3805 except TypeError:
   3806     # If we have a listlike key, _check_indexing_error will raise
   3807     #  InvalidIndexError. Otherwise we fall through and re-raise
   3808     #  the TypeError.
   3809     self._check_indexing_error(key)

KeyError: 'cuTAR176226'
No description has been provided for this image
In [50]:
lnc_bin50_all.var
Out[50]:
real_gene_name n_cells n_counts mean_umi means dispersions dispersions_norm highly_variable ENS
real_gene_name
TSPAN6 TSPAN6 49 82 1.673469 0.050039 5.026673 0.563557 False ENSG00000000003
TNMD TNMD 22 42 1.909091 0.029200 5.252887 0.912412 False ENSG00000000005
DPM1 DPM1 138 248 1.797101 0.152459 5.489168 1.276791 False ENSG00000000419
SCYL3 SCYL3 259 450 1.737452 0.251395 5.070881 -0.563027 False ENSG00000000457
C1orf112 C1orf112 420 686 1.633333 0.361828 5.072034 -0.560698 False ENSG00000000460
... ... ... ... ... ... ... ... ... ...
cuTAR99983 cuTAR99983 1 3 3.000000 0.001482 4.820015 0.244860 False cuTAR99983
cuTAR99984 cuTAR99984 2 2 1.000000 0.001233 4.029741 -0.973857 False cuTAR99984
cuTAR99985 cuTAR99985 3 9 3.000000 0.008704 5.941693 1.974651 False cuTAR99985
cuTAR99986 cuTAR99986 17 30 1.764706 0.021736 5.294215 0.976146 False cuTAR99986
cuTAR99988 cuTAR99988 1 1 1.000000 0.000319 3.283414 -2.124802 False cuTAR99988

206923 rows × 9 columns

In [53]:
# Check for matching rows in the .var dataframe
lnc_bin50_all.var[lnc_bin50_all.var.index.str.contains("cuTAR176226", case=False)]
Out[53]:
real_gene_name n_cells n_counts mean_umi means dispersions dispersions_norm highly_variable ENS
real_gene_name
In [ ]: